Llibreries
library(tidyverse)
library(funModeling)
library(lubridate)
library(weathermetrics)
library(factoextra)
library(corrplot)
library(qgraph)
library(kableExtra)
library(scales)
library(plotly)
library(broom)
library(tidymodels)
Dades
machine <- read_csv("machine_temperature_system_failure.csv")
df_status(machine)
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## 1 timestamp 0 0 0 0 0 0 POSIXct/POSIXt 22683
## 2 value 0 0 0 0 0 0 numeric 22695
machine <-
machine %>%
mutate(temp = fahrenheit.to.celsius(value, round = 2))
ggplot(machine, aes(x = timestamp, y=temp))+
geom_line(color="grey30")+
theme_minimal()+
labs(title = "Registre Temperatura. (5s.)", x=" ", y="Temperatura (?C)")+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))

Feature Engineering
machine <-
machine %>%
mutate(hora = hour(timestamp),
torn = ifelse(hora > 7 & hora < 22, "dia", "nit"),
diaSet = wday(timestamp, week_start = getOption("lubridate.week.start", 1)),
jour = date(timestamp),
wEnd = ifelse(diaSet %in% c(1,2,3,4,5), "SET", "CAP")) %>%
group_by(wEnd, torn) %>%
mutate(grup = paste(wEnd, torn, sep = "_")) %>%
ungroup() %>%
mutate(grup = as_factor(grup))
ggplot(machine, aes(x = timestamp, y=temp, color= grup))+
geom_line()+
theme_minimal()+
labs(title = "Registre Temperatura. (5s.)", x=" ", y="Temperatura (?C)")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))+
facet_wrap(~grup)

ggplot(machine, aes(x=temp, fill= grup))+
geom_histogram(color= "white")+
theme_minimal()+
labs(title = "Registre Temperatura. (5s.)", x=" ", y="Temperatura (?C)")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))+
facet_wrap(~grup, scales = "free")

ggplot(machine, aes(y=temp, fill= grup))+
geom_boxplot()+
theme_minimal()+
labs(title = "Registre Temperatura. (5s.)", x=" ", y="Temperatura (?C)")+
theme(legend.position = "top",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5),
axis.text.x = element_blank())

Principal Components
acp <-prcomp(machine_num, center=T, scale.=T) # PCA + estandatització de variables
acp
## Standard deviations (1, .., p=5):
## [1] 1.3393728 1.2327473 0.9994110 0.6964306 0.4500852
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4 PC5
## temp 0.0500410651 0.119989562 -0.98525101 0.07394691 0.083130600
## hora 0.0009793380 -0.698735979 -0.13852758 -0.70177270 -0.009605317
## torn_n 0.0003314771 -0.705195736 -0.03150325 0.70816668 0.014365251
## wEnd_n -0.7047566596 0.008271019 -0.09270506 0.01870399 -0.703068683
## diaSet 0.7076813362 0.001049491 -0.02244710 0.01403727 -0.706034778
# eigenvalue: valors superiors a 1, varian?a percentual acumulada
get_eigenvalue(acp)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 1.7939196 35.878392 35.87839
## Dim.2 1.5196658 30.393316 66.27171
## Dim.3 0.9988223 19.976447 86.24815
## Dim.4 0.4850156 9.700312 95.94847
## Dim.5 0.2025767 4.051533 100.00000
# gr?fic varian?a acumulada
fviz_eig(acp, addlabels = TRUE, ylim = c(0, 40))

actives <-
as.data.frame(acp$x[,1:3]) %>%
select('PC1_dia' = 'PC1', 'PC2_hora'= "PC2", 'PC3_temp'= "PC3")
plot(acp$x[,1], acp$x[,3], xlab="PC1_dia", ylab="PC3_temp")
abline(h=0,v=0,col="gray60")

plot(acp$x[,2], acp$x[,3], xlab="PC2_hora", ylab="PC3_temp")
abline(h=0,v=0,col="gray60")

Correlacions
kor <- cor(machine_num, acp$x[,1:3])
kor %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", full_width= F))
|
|
PC1
|
PC2
|
PC3
|
|
temp
|
0.0670236
|
0.1479168
|
-0.9846707
|
|
hora
|
0.0013117
|
-0.8613649
|
-0.1384460
|
|
torn_n
|
0.0004440
|
-0.8693281
|
-0.0314847
|
|
wEnd_n
|
-0.9439319
|
0.0101961
|
-0.0926505
|
|
diaSet
|
0.9478492
|
0.0012938
|
-0.0224339
|
corrplot(kor)

# rotacions
acp_rot<-varimax(kor)
acp_rot$loadings
##
## Loadings:
## PC1 PC2 PC3
## temp -0.997
## hora -0.872
## torn_n -0.867
## wEnd_n -0.947
## diaSet 0.946
##
## PC1 PC2 PC3
## SS loadings 1.792 1.512 1.008
## Proportion Var 0.358 0.302 0.202
## Cumulative Var 0.358 0.661 0.862
Dendograma
dd <- dist(actives, method = "euclidean")
hc.ward <- hclust(dd, method = "ward.D2")
plot(hc.ward, hang = -1, cex = 0.5)
rect.hclust(hc.ward, k=11, border="red")

k_15 <- cutree(hc.ward, k = 11)
# assignaci? de cluster segons hcust
actives$k <- as.factor(k_15)
prop <- prop.table(table(k_15))
round(prop,2)
## k_15
## 1 2 3 4 5 6 7 8 9 10 11
## 0.08 0.11 0.13 0.08 0.02 0.11 0.11 0.08 0.08 0.18 0.02
table(k_15)
## k_15
## 1 2 3 4 5 6 7 8 9 10 11
## 1914 2521 2909 1738 376 2587 2579 1735 1920 3984 432
Gr?fic Cluster HCLUST
actives %>%
mutate(k= as.factor(k)) %>%
# filter(k %in% c(1,2,3,4)) %>%
ggplot()+
geom_point(aes(x = PC1_dia, y = PC3_temp, col = k))+
theme_bw()+
labs(title = "Clusters", x="PC1_dia", y="PC3_temp")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))

#facet_wrap(~k)
# clusters dendograma
actives %>%
mutate(k= as.factor(k)) %>%
# filter(k %in% c(1,2,3,4)) %>%
ggplot()+
geom_point(aes(x = PC2_hora, y = PC3_temp, col = k))+
theme_bw()+
labs(title = "Clusters", x="PC2_hora", y="PC3_temp")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))

Centres HCLUST
cdg <-
actives %>%
dplyr::group_by(k) %>%
dplyr::summarise(c_dia=mean(PC1_dia),
c_hora=mean(PC2_hora),
c_temp=mean(PC3_temp)) %>%
select(-k)
head(cdg)
## # A tibble: 6 x 3
## c_dia c_hora c_temp
## <dbl> <dbl> <dbl>
## 1 -1.21 -0.596 0.210
## 2 -1.11 1.66 0.151
## 3 -0.424 -1.16 -0.568
## 4 -0.918 -1.08 1.77
## 5 -1.39 1.42 2.07
## 6 -0.374 1.69 -0.379
KMEANS
k_mean <- kmeans(actives[,1:3], centers = cdg)
# k_mean
# table(k_mean$cluster)
# k_mean$size
# table(actives$k, k_mean$cluster)
# assignaci? de cluster segons kmean
actives$kmean <-as.factor(k_mean$cluster)
machine$kmean <-as.factor(k_mean$cluster)
#distribuci? cluster, size
k_mean %>%
tidy() %>%
select(cluster, size) %>%
kable(caption = "Distribuci? clusters") %>%
kable_styling(full_width = F)
Distribuci? clusters
|
cluster
|
size
|
|
1
|
1700
|
|
2
|
2201
|
|
3
|
2412
|
|
4
|
1344
|
|
5
|
676
|
|
6
|
2607
|
|
7
|
2870
|
|
8
|
2549
|
|
9
|
1919
|
|
10
|
3927
|
|
11
|
490
|
Comparaci? K-MEANS / HCLUST
c_m <- conf_mat(actives, k, kmean, dnn= c("kmean", "hclust"))
c_m %>%
pluck(1) %>%
as_tibble() %>%
ggplot(aes(hclust, kmean, alpha = n)) +
geom_tile(show.legend = FALSE, fill = "blue", color = "white") +
geom_text(aes(label = n), colour = "white", alpha = 1, size = 6)+
labs(title = "Confusion Matrix hclust-kmean" )+
theme_minimal()+
theme(legend.position = "none",
panel.grid.major.x =element_blank(),
panel.grid.major.y =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))

accu <-
summary(c_m) %>%
select(-.estimator) %>%
filter(.metric %in% c("accuracy"))
percent(accu$.estimate)
## [1] "84.8%"
# index d'estabilitat 84.84%
Gr?fic Clusters K-MEANS
# plot clusters ggplot
actives %>%
mutate(k= as.factor(k)) %>%
ggplot()+
geom_point(aes(x = PC1_dia, y = PC3_temp, col = k))+
theme_bw()+
labs(title = "Clusters", x="PC1_dia", y="PC3_temp")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))

#facet_wrap(~k)
actives %>%
mutate(k= as.factor(k)) %>%
ggplot()+
geom_point(aes(x = PC2_hora, y = PC3_temp, col = k))+
theme_bw()+
labs(title = "Clusters", x="PC2_hora", y="PC3_temp")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))

#facet_wrap(~k)
Gr?fic 3-D clusters K-MEANS
plot_ly(x=actives$PC1_dia, y=actives$PC2_hora, z=actives$PC3_temp, type="scatter3d", mode="markers", color=actives$kmean)
Centres K-MEANS
# tots els centres de cada punt
# k_mean$centers[k_mean$cluster, ]
# fitted(k_mean)
# head(k_mean$centers)
centroids <-
as.data.frame(fitted(k_mean)) %>% #resum dels centres
select('PC1_cen' = 'PC1_dia', 'PC2_cen'= "PC2_hora", 'PC3_cen'= "PC3_temp")
head(centroids)
## PC1_cen PC2_cen PC3_cen
## X1 -1.05906 -0.7869974 0.5792558
## X1.1 -1.05906 -0.7869974 0.5792558
## X1.2 -1.05906 -0.7869974 0.5792558
## X1.3 -1.05906 -0.7869974 0.5792558
## X1.4 -1.05906 -0.7869974 0.5792558
## X1.5 -1.05906 -0.7869974 0.5792558
#assignaci? centres de clusters
actives$PC1_cen <- centroids$PC1_cen
actives$PC2_cen <- centroids$PC2_cen
actives$PC3_cen <- centroids$PC3_cen
# plot clusters amb centres
actives %>%
mutate(kmean= as.factor(kmean)) %>%
ggplot()+
geom_point(aes(x = PC1_dia, y = PC3_temp, col = kmean), alpha = 0.5)+
geom_point(aes(x = PC1_cen, y = PC3_cen), color = "black", shape = 13, size = 5)+
theme_bw()+
labs(title = "Clusters dia-temp", x="PC1_dia", y="PC3_temp")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))+
facet_wrap(~kmean)

actives %>%
mutate(kmean= as.factor(kmean)) %>%
ggplot()+
geom_point(aes(x = PC2_hora, y = PC3_temp, col = kmean), alpha = 0.5)+
geom_point(aes(x = PC2_cen, y = PC3_cen), color = "black", shape = 13, size = 5)+
theme_bw()+
labs(title = "Clusters hora-temp", x="PC2_hora", y="PC3_temp")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))+
facet_wrap(~kmean)

Dist?ncies
# Distancia Euclidea al Centroid
actives <-
actives %>%
mutate(dist_c = sqrt( (PC1_dia - PC1_cen)^2 + (PC2_hora - PC2_cen)^2 + (PC3_temp - PC3_cen)^2))
# detectar outliers de les distancies
anomalies <-
actives %>%
dplyr::group_by(kmean) %>%
dplyr::mutate(iqr = IQR(dist_c),
q_75 = quantile(dist_c, probs = 0.75),
out = ifelse(dist_c > q_75 + iqr * 1.5, 1, 0)) %>%
ungroup()
head(anomalies)
## # A tibble: 6 x 12
## PC1_dia PC2_hora PC3_temp k kmean PC1_cen PC2_cen PC3_cen dist_c
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 -1.53 -1.66 0.615 1 1 -1.06 -0.787 0.579 0.993
## 2 -1.53 -1.65 0.546 1 1 -1.06 -0.787 0.579 0.984
## 3 -1.52 -1.64 0.461 1 1 -1.06 -0.787 0.579 0.979
## 4 -1.52 -1.62 0.317 1 1 -1.06 -0.787 0.579 0.988
## 5 -1.51 -1.61 0.232 1 1 -1.06 -0.787 0.579 1.00
## 6 -1.51 -1.62 0.276 1 1 -1.06 -0.787 0.579 0.995
## # ... with 3 more variables: iqr <dbl>, q_75 <dbl>, out <dbl>
# total de outliers per cluster
anomalies %>%
dplyr::summarise(Total = sum(out),
Percentatge = percent(Total / nrow(actives))) %>%
kable(caption = "Total Anomalies") %>%
kable_styling(full_width = F)
Total Anomalies
|
Total
|
Percentatge
|
|
331
|
1.46%
|
# assignar anomalies a la taula actives
actives$out <- as.factor(anomalies$out)
machine$out <- anomalies$out
Outliers de les Dist?ncies
# dispersi? de clusters i possibles outliers
actives %>%
mutate(kmean = as.factor(kmean)) %>%
ggplot(aes(x = kmean, y=dist_c, color= as.factor(kmean)))+
geom_boxplot(outlier.shape = NA, varwidth = TRUE)+
geom_jitter(data = subset(actives, out == 1), aes(color= as.factor(kmean)), alpha = 0.5)+
theme_minimal()+
labs(title = "Ditancia Euclidea al Centroid", x=" ", y="")+
theme(legend.position = "none",
panel.grid.major.x =element_blank(),
panel.grid.major.y =element_blank(),
panel.grid.minor.y =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5),
axis.text.y = element_blank())

Clusters amb outliers
# plot clusters amb outlir
actives %>%
ggplot()+
geom_point(aes(x = PC1_dia, y = PC3_temp, col = out), alpha = 0.5)+
geom_point(aes(x = PC1_cen, y = PC3_cen), color = "black", shape = 13, size = 5)+
scale_color_manual(values=c("gray70","red"))+
theme_bw()+
labs(title = "Clusters dia-temp", x="PC1_dia", y="PC3_temp")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))+
facet_wrap(~kmean)

actives %>%
ggplot()+
geom_point(aes(x = PC2_hora, y = PC3_temp, color = out), alpha = 0.4)+
geom_point(aes(x = PC2_cen, y = PC3_cen), color = "black", shape = 13, size = 5)+
scale_color_manual(values=c("gray70","red"))+
theme_bw()+
labs(title = "Clusters PC2-PC3", x="PC2_hora", y="PC3_temp")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))+
facet_wrap(~kmean)

Gr?fic 3D cluster 1
clusty <- 1
mig <- c(k_mean$centers[clusty, ],clusty,2)
cluster_sel <-
actives %>%
filter(kmean == clusty) %>%
select(PC1_dia, PC2_hora, PC3_temp, kmean, anomal= out) %>%
mutate(anomal = fct_expand(anomal, "2")) %>%
rbind(mig)
colors <- c('#c6cbcc', '#ba072b', '#46c4eb')
plot_ly(cluster_sel, x= ~PC1_dia, y= ~PC2_hora, z= ~PC3_temp,
type="scatter3d", mode="markers", color = ~anomal, colors = colors)
Distribuci? horaria de les anomalies
# distribuci? horaria de les anomalies
machine %>%
filter(out == 1) %>%
ggplot(aes(x=hora, fill= grup))+
geom_histogram(color= "white")+
theme_bw()+
labs(title = "Anomalies x hores", x=" ", y="")+
theme(legend.position = "none",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))+
facet_wrap(~grup)

Distribuci? diaria de les anomalies
# distribuci? diaria de les anomalies
machine %>%
filter(out == 1) %>%
ggplot(aes(x=diaSet, fill= grup))+
geom_histogram(color= "white")+
theme_bw()+
labs(title = "Aanomalies x dies", x=" ", y="")+
theme(legend.position = "top",
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))

Linia temporal amb anomalies
machine %>%
ggplot(aes(x = timestamp, y=temp), alpha = 0.4)+
geom_point(color="grey70")+
geom_line(color="grey70")+
geom_point(data = subset(machine, out == 1), color = "red", size = 3)+
theme_minimal()+
labs(title = "Registre Temperatura. (5s.)", x=" ", y="Temperatura (?C)")+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))

Per grups:
# per grups
machine %>%
ggplot(aes(x = timestamp, y=temp), alpha = 0.4)+
geom_point(color="grey70")+
geom_line(color="grey70")+
geom_point(data = subset(machine, out == 1), color = "red", size = 3)+
theme_minimal()+
labs(title = "Registre Temperatura. (5s.)", x=" ", y="Temperatura (?C)")+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))+
facet_wrap(~grup)

Per dies:
# per dies
machine %>%
ggplot(aes(x = timestamp, y=temp), alpha = 0.4)+
geom_point(color="grey70")+
# geom_line(color="grey70")+
geom_point(data = subset(machine, out == 1), color = "red", size = 3)+
theme_bw()+
labs(title = "Registre Temperatura. (5s.)", x=" ", y="Temperatura (?C)")+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank(),
panel.grid.major.x =element_blank(),
title = element_text(color= "dodgerblue3"),
plot.title = element_text(hjust=0.5))+
facet_wrap(~wday(timestamp, week_start = getOption("lubridate.week.start", 1)), ncol = 2)

Tots els dilluns:
sel_dia <-
machine %>%
filter(diaSet == 1)
plt <-
sel_dia %>%
ggplot(aes(x = timestamp, y=temp, text= paste("dia=", diaSet, "\n")), alpha = 0.4)+
geom_point(color="grey70")+
geom_point(data = subset(sel_dia, out == 1), color = "red", size = 3 )+
theme_minimal()
ggplotly(plt)
Dilluns 16-12-2013:
dia_treball <-
machine %>%
filter(diaSet == 1) %>%
# filter(jour == "2013-12-16")
filter(jour == "2013-12-16")
dia_treball %>%
ggplot(aes(x = timestamp, y=temp), alpha = 0.4)+
geom_point(color="grey70")+
geom_line(color="grey70")+
geom_point(data = subset(dia_treball, out == 1), color = "red", size = 3 )+
theme_minimal()

saveRDS(actives, file = "actives.rds")
saveRDS(k_mean$centers, file = "centres.rds")
saveRDS(machine, file = "machine.rds")